# Hyperparameters for a VAE whose "pretrained_source" is "dc-ae": a handful of
# top-level settings plus one nested sub-config each for "encoder" and "decoder".
#
# Within each sub-config, width_list, depth_list and block_type all have six
# entries -- presumably one per network stage; confirm against the code that
# consumes this dict.
#
# The encoder and decoder sub-configs differ only in depth_list and in which
# resampling key they carry (downsample_block_type vs. upsample_block_type).
VAE_CONFIG = dict(
    in_channels=3,
    latent_channels=32,
    pretrained_source="dc-ae",
    scaling_factor=0.41407,
    encoder=dict(
        in_channels=3,
        latent_channels=32,
        width_list=[128, 256, 512, 512, 1024, 1024],
        depth_list=[2] * 3 + [3] * 3,
        # Built by a fresh expression here and again in the decoder, so the two
        # sub-configs never share a list object.
        block_type=["ResBlock"] * 3 + ["EViTS5_GLU"] * 3,
        norm="rms2d",
        act="silu",
        downsample_block_type="Conv",
    ),
    decoder=dict(
        in_channels=3,
        latent_channels=32,
        width_list=[128, 256, 512, 512, 1024, 1024],
        depth_list=[3] * 6,
        block_type=["ResBlock"] * 3 + ["EViTS5_GLU"] * 3,
        norm="rms2d",
        act="silu",
        upsample_block_type="InterpolateConv",
    ),
)
